This project aims to promote food safety in Chicago. The goal is to build a classification model that predicts the outcome of a food safety inspection based on the inspectors' comments.
import pandas as pd
import requests
import re
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
%%time
# Define the API endpoint and parameters
url = "https://data.cityofchicago.org/resource/cwig-ma7x.json"
# Fetch the total count of records
def get_total_count(url):
    """Return the total number of records available at the Socrata endpoint.

    Issues a COUNT(*) query via the SoQL ``$select`` parameter; the API
    answers with a single-row JSON list like ``[{"count": "267124"}]``.

    Raises:
        requests.HTTPError: if the endpoint returns an error status.
        requests.Timeout: if the endpoint does not answer within 60 s.
    """
    params = {
        "$select": "count(*)"
    }
    # Fail fast on HTTP errors instead of crashing later on a malformed
    # JSON body, and never hang forever on an unresponsive endpoint.
    response = requests.get(url, params=params, timeout=60)
    response.raise_for_status()
    data = response.json()
    return int(data[0]['count'])
total_count = get_total_count(url)
print(f"Total number of records available: {total_count:,.0f}")
Total number of records available: 267,124 CPU times: total: 46.9 ms Wall time: 584 ms
%%time
# Fetch data and load it into a pandas DataFrame
def fetch_data(url, params):
    """Fetch records from the endpoint and return them as a DataFrame.

    ``params`` is passed straight through as the query string; the JSON
    response is assumed to be a list of flat record objects.

    Raises:
        requests.HTTPError: if the endpoint returns an error status.
        requests.Timeout: if the request exceeds the timeout.
    """
    # Generous timeout: the full-dataset pull takes tens of seconds
    # (the recorded wall time above was ~27 s for ~267k rows).
    response = requests.get(url, params=params, timeout=300)
    response.raise_for_status()
    data = response.json()
    return pd.DataFrame(data)
# Set the limit parameter equal to the number of available records
params = {
"$limit": total_count
}
# Fetch the data and load it into a DataFrame
df = fetch_data(url, params)
print(f'Number of records retrieved: {df.shape[0]:,.0f}')
Number of records retrieved: 267,124 CPU times: total: 4.75 s Wall time: 27.4 s
df.head(5)
| inspection_id | dba_name | aka_name | license_ | facility_type | risk | address | city | state | zip | ... | results | violations | latitude | longitude | location | :@computed_region_awaf_s7ux | :@computed_region_6mkv_f3dw | :@computed_region_vrxf_vc4k | :@computed_region_bdys_3d7i | :@computed_region_43wa_7qmu | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2588891 | THE LANZAGA | THE LANZAGA | 2951700 | Restaurant | Risk 1 (High) | 3734-3736 N BROADWAY | CHICAGO | IL | 60613 | ... | Fail | 5. PROCEDURES FOR RESPONDING TO VOMITING AND D... | 41.950232386786 | -87.64918094440476 | {'type': 'Point', 'coordinates': [-87.64918094... | 37 | 21186 | 57 | 726 | 39 |
| 1 | 2588914 | SUBWAY | SUBWAY | 2262950 | Restaurant | Risk 1 (High) | 2008 N HALSTED ST | CHICAGO | IL | 60614 | ... | Pass | 51. PLUMBING INSTALLED; PROPER BACKFLOW DEVICE... | 41.91849213778706 | -87.64866909416301 | {'type': 'Point', 'coordinates': [-87.64866909... | 51 | 21190 | 68 | 744 | 34 |
| 2 | 2588885 | CARNICERIA Y FRUTERIA EL MILAGRITO | CARNICERIA Y FRUTERIA EL MILAGRITO | 2930227 | Grocery Store | Risk 2 (Medium) | 5960 W LAWRENCE AVE | CHICAGO | IL | 60630 | ... | Pass | 54. GARBAGE & REFUSE PROPERLY DISPOSED; FACILI... | 41.96776480013477 | -87.777020204254 | {'type': 'Point', 'coordinates': [-87.77702020... | 20 | 21869 | 15 | 94 | 50 |
| 3 | 2588881 | JET'S PIZZA | JET'S PIZZA | 2522268 | Restaurant | Risk 1 (High) | 1025 W MADISON ST | CHICAGO | IL | 60607 | ... | Pass | NaN | 41.88157249576794 | -87.65305233593274 | {'type': 'Point', 'coordinates': [-87.65305233... | 48 | 14917 | 29 | 91 | 26 |
| 4 | 2588848 | HO FAT LEE CHINESE KITCHEN, I NC. | HO FAT LEE CHINESE KITCHEN | 1543266 | Restaurant | Risk 1 (High) | 1114 S KEDZIE AVE | CHICAGO | IL | 60612 | ... | Pass | NaN | 41.86778016933903 | -87.70585900858474 | {'type': 'Point', 'coordinates': [-87.70585900... | 36 | 21184 | 30 | 98 | 14 |
5 rows × 22 columns
# Work on a copy so the raw API pull in `df` stays untouched.
clean_data = df.copy()
# Rows with no recorded violations carry no comment text to model on; drop them.
clean_data.dropna(subset = ['violations'], inplace=True)
def desc(x):
    """Extract the violation description texts from a raw `violations` string.

    Each violation is delimited by '|' and formatted as
    '<number>. <DESCRIPTION> - Comments: <text>'; only the description part
    (between the numeric code and the comments marker) is returned.
    """
    pattern = r"\d+\. (.*?)(?: - Comments:|$)"
    found = []
    for segment in x.split('|'):
        found.extend(re.findall(pattern, segment))
    return found
def comments(x):
    """Extract the inspector comment texts from a raw `violations` string.

    Each '|'-delimited violation carries its free-text remarks after the
    '- Comments: ' marker; returns one string per violation that has remarks.
    """
    pattern = r'- Comments: (.*)'
    found = []
    for segment in x.split('|'):
        found.extend(re.findall(pattern, segment))
    return found
clean_data["comments"] = clean_data.loc[:,"violations"].map(comments)
import nltk
import nltk.corpus
from nltk.text import Text
# Ensure NLTK resources are downloaded (e.g., tokenizers, corpora)
nltk.download('punkt')
clean_data["comments_tokenized"] = clean_data["comments"].map(lambda x:[nltk.tokenize.word_tokenize(i) for i in x])
[nltk_data] Downloading package punkt to [nltk_data] C:\Users\10124\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date!
nltk.download('stopwords')
from nltk.corpus import stopwords
def clean_tokens(tokens_list):
    """Normalize a list of token lists: lowercase, drop stopwords, keep words.

    For each inner token list: tokens are lowercased, English stopwords are
    removed, and only purely alphabetic tokens are kept (this discards
    punctuation, numbers, and code references such as '4-601.11').

    Args:
        tokens_list: list of lists of string tokens (one list per comment).

    Returns:
        A list of cleaned token lists, same outer length as the input.
    """
    # Build the stopword set once. The original rebuilt it inside the loop,
    # repeating the corpus lookup for every sublist of every row — pure
    # loop-invariant work on a frame with hundreds of thousands of rows.
    stop_words = set(stopwords.words('english'))
    cleaned_tokens_list = []
    for tokens in tokens_list:
        # Convert to lower case
        tokens = [token.lower() for token in tokens]
        # Remove stopwords
        tokens = [token for token in tokens if token not in stop_words]
        # Remove punctuation & numbers (keep only alphabetic tokens)
        tokens = [token for token in tokens if token.isalpha()]
        cleaned_tokens_list.append(tokens)
    return cleaned_tokens_list
# Assuming clean_data['comments_tokenized'] is a list of lists
clean_data['comments_tokenized_clean'] = clean_data['comments_tokenized'].apply(lambda x: clean_tokens(x) if isinstance(x, list) and all(isinstance(lst, list) for lst in x) else [])
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\10124\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wnl = WordNetLemmatizer()
def flatten_and_lemmatize(tokens_list_of_lists):
    """Flatten nested token lists into one flat list of lemmatized tokens.

    Lemmatization (via the module-level WordNet lemmatizer `wnl`) maps each
    token to its base form so that variants compare equal downstream.
    """
    # Flatten and lemmatize in a single pass.
    return [wnl.lemmatize(token)
            for sublist in tokens_list_of_lists
            for token in sublist]
# Apply the function to the column that contains a list of lists
clean_data['comments_tokenized_flatten_lemma'] = clean_data['comments_tokenized_clean'].apply(flatten_and_lemmatize)
[nltk_data] Downloading package wordnet to [nltk_data] C:\Users\10124\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
clean_data['comments_final'] = clean_data['comments_tokenized_flatten_lemma'].apply(lambda x: ' '.join(map(str, x)))
clean_data.head()
| inspection_id | dba_name | aka_name | license_ | facility_type | risk | address | city | state | zip | ... | :@computed_region_awaf_s7ux | :@computed_region_6mkv_f3dw | :@computed_region_vrxf_vc4k | :@computed_region_bdys_3d7i | :@computed_region_43wa_7qmu | comments | comments_tokenized | comments_tokenized_clean | comments_tokenized_flatten_lemma | comments_final | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2588891 | THE LANZAGA | THE LANZAGA | 2951700 | Restaurant | Risk 1 (High) | 3734-3736 N BROADWAY | CHICAGO | IL | 60613 | ... | 37 | 21186 | 57 | 726 | 39 | [NO WRITTEN CLEANING PROCEDURE OR REQUIRED EQU... | [[NO, WRITTEN, CLEANING, PROCEDURE, OR, REQUIR... | [[written, cleaning, procedure, required, equi... | [written, cleaning, procedure, required, equip... | written cleaning procedure required equipment ... |
| 1 | 2588914 | SUBWAY | SUBWAY | 2262950 | Restaurant | Risk 1 (High) | 2008 N HALSTED ST | CHICAGO | IL | 60614 | ... | 51 | 21190 | 68 | 744 | 34 | [ 5-205.15--- OBSERVED LEAKING AT THE FAUCET B... | [[5-205.15, --, -, OBSERVED, LEAKING, AT, THE,... | [[observed, leaking, faucet, base, rear, sink,... | [observed, leaking, faucet, base, rear, sink, ... | observed leaking faucet base rear sink must re... |
| 2 | 2588885 | CARNICERIA Y FRUTERIA EL MILAGRITO | CARNICERIA Y FRUTERIA EL MILAGRITO | 2930227 | Grocery Store | Risk 2 (Medium) | 5960 W LAWRENCE AVE | CHICAGO | IL | 60630 | ... | 20 | 21869 | 15 | 94 | 50 | [OBSERVED BOTH EXPOSED HANDSINKS IN BUTCHER PR... | [[OBSERVED, BOTH, EXPOSED, HANDSINKS, IN, BUTC... | [[observed, exposed, handsinks, butcher, prep,... | [observed, exposed, handsinks, butcher, prep, ... | observed exposed handsinks butcher prep area w... |
| 5 | 2588819 | COMMON DECENCY FEVER DREAM | COMMON DECENY FEVER DREAM | 2944240 | Restaurant | Risk 1 (High) | 3152-3154 W DIVERSEY AVE | CHICAGO | IL | 60647 | ... | 15 | 22535 | 22 | 465 | 20 | [FOUND NO EMPLOYEE HEALTH POLICY AT PREMISES.P... | [[FOUND, NO, EMPLOYEE, HEALTH, POLICY, AT, PRE... | [[found, employee, health, policy, citation, p... | [found, employee, health, policy, citation, pr... | found employee health policy citation provide ... |
| 6 | 2588828 | TIKAL SABOR CHAPIN | TIKAL SABOR CHAPIN | 2951612 | Restaurant | Risk 1 (High) | 3216 W LAWRENCE AVE | CHICAGO | IL | 60625 | ... | 28 | 21849 | 14 | 750 | 20 | [OBSERVED NO WRITTEN EMPLOYEE HEALTH POLICY ON... | [[OBSERVED, NO, WRITTEN, EMPLOYEE, HEALTH, POL... | [[observed, written, employee, health, policy,... | [observed, written, employee, health, policy, ... | observed written employee health policy premis... |
5 rows × 27 columns
clean_data["results"].unique()
array(['Fail', 'Pass', 'Pass w/ Conditions', 'No Entry', 'Not Ready',
'Out of Business'], dtype=object)
Predicting the outcome of an inspection: the comments are the predictors, and the target variable is the "Results" column.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics
# Keep only the predictor text and the target label.
violation = clean_data[["comments_final","results"]]
# Binary classification: restrict to clear Pass/Fail outcomes, dropping the
# other categories seen above ('Pass w/ Conditions', 'No Entry', 'Not Ready',
# 'Out of Business').
violation = violation[(violation["results"] == "Pass") | (violation["results"] == "Fail")]
violation.groupby("results").count()
| comments_final | |
|---|---|
| results | |
| Fail | 48730 |
| Pass | 104467 |
violation = violation.groupby("results").sample(n=10000, random_state=1)
violation.reset_index(inplace=True, drop=True)
violation.groupby("results").count()
| comments_final | |
|---|---|
| results | |
| Fail | 10000 |
| Pass | 10000 |
# Encode the target label: Fail -> 0, Pass -> 1.
violation['result_flag'] = violation.results.map({'Fail':0, 'Pass':1})
X = violation.comments_final
y = violation.result_flag
# Default 75/25 split; random_state fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# Initialize CountVectorizer with the desired ngram range
vect = CountVectorizer(ngram_range=(1, 2))
# Fit the vocabulary on the training text only, then reuse it to transform
# the test set so no test-set information leaks into the features.
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
print(X_train)
18960 metal walk cooler shelf shelving rice cooker d...
11353 food contact surface equipment food storage ut...
7449 observed singed employee health policy site fo...
14308 broken exposed insulation liner inside white c...
11889 exterior grease trap dishroom showing sign det...
...
10955 vent throughout clean instructed clean must cl...
17289 observed rubber gasket door door cooler prep a...
5192 found front exit door inch gap glass door fron...
12172 must defrost deep freezer basement instructed ...
235 noted broken knob exposed hand wash sink cold ...
Name: comments_final, Length: 15000, dtype: object
logreg = LogisticRegression(max_iter=10000)
%time logreg.fit(X_train_dtm, y_train)
CPU times: total: 15.7 s Wall time: 18.7 s
LogisticRegression(max_iter=10000)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(max_iter=10000)
y_pred_class = logreg.predict(X_test_dtm)
logic_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(f"Test Accuracy: {logic_accuracy * 100:.1f}%")
Test Accuracy: 96.5%
print(classification_report(y_test, y_pred_class))
precision recall f1-score support
0 0.98 0.95 0.97 2569
1 0.95 0.98 0.96 2431
accuracy 0.96 5000
macro avg 0.96 0.97 0.96 5000
weighted avg 0.97 0.96 0.96 5000
clf = logreg
feature_names = vect.get_feature_names_out()
coefs_with_fns = zip(feature_names, clf.coef_[0])
coefs_with_fns_df = pd.DataFrame(coefs_with_fns,
columns=['feature', 'coefficient'])
coefs_with_fns_df.sort_values(by='coefficient', ascending=True, inplace=True)
coefs_with_fns_df.head(10)
| feature | coefficient | |
|---|---|---|
| 130675 | serious | -3.536678 |
| 35563 | critical | -2.011492 |
| 111673 | priority | -2.010268 |
| 22637 | citation | -1.933869 |
| 160045 | violation | -1.763725 |
| 130853 | serious violation | -1.504526 |
| 76401 | issued | -1.396010 |
| 35666 | critical violation | -1.261778 |
| 160309 | violation observed | -1.183781 |
| 9218 | barrier | -1.140754 |
# instantiate a Multinomial Naive Bayes model
nb = MultinomialNB()
# train and time the model using X_train_dtm
%time nb.fit(X_train_dtm, y_train)
CPU times: total: 31.2 ms Wall time: 32 ms
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)
# calculate accuracy of class predictions
bayes_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(f"Test Accuracy: {bayes_accuracy * 100:.1f}%")
Test Accuracy: 90.0%
# calculate precision and recall
print(classification_report(y_test, y_pred_class))
precision recall f1-score support
0 0.87 0.94 0.91 2569
1 0.94 0.85 0.89 2431
accuracy 0.90 5000
macro avg 0.90 0.90 0.90 5000
weighted avg 0.90 0.90 0.90 5000
# calculate the confusion matrix
print(metrics.confusion_matrix(y_test, y_pred_class))
[[2426 143] [ 356 2075]]
clf = nb # Assuming 'nb' is an instance of MultinomialNB that's already been fitted
feature_names = vect.get_feature_names_out()
log_prob = clf.feature_log_prob_
# Since we're dealing with log probabilities, you can directly use 'log_prob'
# Assuming interest in the first class for demonstration
coefs_with_fns = zip(feature_names, log_prob[0])
coefs_with_fns_df = pd.DataFrame(coefs_with_fns, columns=['feature', 'log_probability'])
coefs_with_fns_df.sort_values(by='log_probability', ascending=False, inplace=True)
coefs_with_fns_df.head(10)
| feature | log_probability | |
|---|---|---|
| 57449 | food | -4.472817 |
| 93479 | must | -4.477255 |
| 4647 | area | -4.613348 |
| 74781 | instructed | -4.642012 |
| 97701 | observed | -4.706363 |
| 23172 | clean | -4.833233 |
| 135100 | sink | -4.952810 |
| 55384 | floor | -5.028783 |
| 86684 | maintain | -5.048695 |
| 110000 | prep | -5.139704 |
# instantiate a SVM model
svm = SGDClassifier(max_iter=100, tol=None)
# train the model using X_train_dtm
%time svm.fit(X_train_dtm, y_train)
# make class predictions for X_test_dtm
y_pred_class = svm.predict(X_test_dtm)
CPU times: total: 562 ms Wall time: 563 ms
# calculate accuracy of class predictions
svm_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(svm_accuracy)
0.9462
# calculate precision and recall
print(classification_report(y_test, y_pred_class))
# calculate the confusion matrix
print(metrics.confusion_matrix(y_test, y_pred_class))
precision recall f1-score support
0 0.95 0.94 0.95 2569
1 0.94 0.95 0.95 2431
accuracy 0.95 5000
macro avg 0.95 0.95 0.95 5000
weighted avg 0.95 0.95 0.95 5000
[[2418 151]
[ 118 2313]]
svm
SGDClassifier(max_iter=100, tol=None)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SGDClassifier(max_iter=100, tol=None)
clf = svm
feature_names = vect.get_feature_names_out()
coefs_with_fns = zip(feature_names, clf.coef_[0])
coefs_with_fns_df = pd.DataFrame(coefs_with_fns,
columns=['feature', 'coefficient'])
coefs_with_fns_df.sort_values(by='coefficient', ascending=True, inplace=True)
coefs_with_fns_df.head(10)
| feature | coefficient | |
|---|---|---|
| 130675 | serious | -4.530316 |
| 22637 | citation | -3.157897 |
| 160045 | violation | -3.091275 |
| 111673 | priority | -2.784812 |
| 35563 | critical | -2.564958 |
| 130853 | serious violation | -2.351767 |
| 113761 | provide | -2.318456 |
| 74062 | install | -2.265158 |
| 28702 | connected | -2.045304 |
| 87459 | maintained | -2.011993 |
Elimination of Stop Words: Commonly occurring words such as "and", "the", and "is" are often excluded from the analysis because they typically don't provide valuable insights for predictive models.
Exclusion of Numbers and Single Characters: In instances where not every unstructured comment is linked to a specific code (for example, "4-601.11(C):"), I ensure uniformity by discarding numbers and individual letters, thereby omitting code references from consideration.
Token Lemmatization: To achieve uniformity among tokens, I apply lemmatization, which adjusts tokens to their base forms, enhancing their comparability.
N-gram selection: after running the hyperparameter search pipeline, I found that an n-gram range of (1, 2) works best. This may be because using bigrams (pairs of consecutive words) along with unigrams allows the model to capture more context and the relationships between words, leading to a better understanding of the text's meaning.
from sklearn.pipeline import Pipeline
pipeline = Pipeline(
[
("vect", CountVectorizer()),
("clf", LogisticRegression()),
]
)
pipeline
Pipeline(steps=[('vect', CountVectorizer()), ('clf', LogisticRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('vect', CountVectorizer()), ('clf', LogisticRegression())])CountVectorizer()
LogisticRegression()
# Hyperparameter search space for the CountVectorizer + LogisticRegression
# pipeline.
parameter_grid = {
    'vect__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
    'vect__min_df': (1, 3, 5, 10),
    'vect__ngram_range': ((1, 1), (1, 2)),  # Unigrams or bigrams
    'clf__C': np.logspace(-4, 4, 9),  # Inverse of regularization strength
    'clf__penalty': ['l1', 'l2'],  # Type of regularization
    # LogisticRegression's default lbfgs solver only supports 'l2'; sampling
    # 'l1' with it makes every such candidate fail to fit (the search log
    # shows 115/200 failed fits with exactly this ValueError). liblinear and
    # saga support both penalties, so include the solver in the search.
    'clf__solver': ['liblinear', 'saga'],
}
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(
estimator=pipeline,
param_distributions=parameter_grid,
n_iter=40,
random_state=0,
n_jobs=2,
verbose=1,
)
print("Performing grid search...")
print("Hyperparameters to be evaluated:")
pprint(parameter_grid)
Performing grid search...
Hyperparameters to be evaluated:
{'clf__C': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03,
1.e+04]),
'clf__penalty': ['l1', 'l2'],
'vect__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
'vect__min_df': (1, 3, 5, 10),
'vect__ngram_range': ((1, 1), (1, 2))}
from time import time
t0 = time()
random_search.fit(X_train, y_train)
print(f"Done in {time() - t0:.3f}s")
Fitting 5 folds for each of 40 candidates, totalling 200 fits
D:\python\Lib\site-packages\sklearn\model_selection\_validation.py:425: FitFailedWarning:
115 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
115 fits failed with the following error:
Traceback (most recent call last):
File "D:\python\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "D:\python\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
return fit_method(estimator, *args, **kwargs)
File "D:\python\Lib\site-packages\sklearn\pipeline.py", line 427, in fit
self._final_estimator.fit(Xt, y, **fit_params_last_step)
File "D:\python\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
return fit_method(estimator, *args, **kwargs)
File "D:\python\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
solver = _check_solver(self.solver, self.penalty, self.dual)
File "D:\python\Lib\site-packages\sklearn\linear_model\_logistic.py", line 56, in _check_solver
raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
warnings.warn(some_fits_failed_message, FitFailedWarning)
D:\python\Lib\site-packages\sklearn\model_selection\_search.py:979: UserWarning: One or more of the test scores are non-finite: [ nan nan nan 0.92346667 nan nan
0.9522 0.83553333 0.96873333 nan nan nan
nan 0.958 0.94546667 nan 0.95033333 nan
0.95986667 0.95293333 nan nan nan 0.952
nan 0.95126667 nan 0.8808 nan nan
nan 0.96033333 nan 0.931 0.9504 nan
nan nan 0.96726667 0.95 ]
warnings.warn(
Done in 177.851s
D:\python\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
print("Best parameters combination found:")
best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(parameter_grid.keys()):
print(f"{param_name}: {best_parameters[param_name]}")
Best parameters combination found: clf__C: 1.0 clf__penalty: l2 vect__max_df: 0.4 vect__min_df: 1 vect__ngram_range: (1, 2)
test_accuracy = random_search.score(X_test, y_test)
print(
"Accuracy of the best parameters using the inner CV of "
f"the random search: {random_search.best_score_:.3f}"
)
print(f"Accuracy on test set: {test_accuracy:.3f}")
Accuracy of the best parameters using the inner CV of the random search: 0.969 Accuracy on test set: 0.965
def shorten_param(param_name):
    """Remove components' prefixes in param_name (e.g. 'vect__max_df' -> 'max_df')."""
    # rsplit with maxsplit=1 keeps only the text after the last '__';
    # names without a prefix come back unchanged ([-1] is the whole string).
    return param_name.rsplit("__", 1)[-1]
cv_results = pd.DataFrame(random_search.cv_results_)
cv_results = cv_results.rename(shorten_param, axis=1)
import plotly.express as px
param_names = [shorten_param(name) for name in parameter_grid.keys()]
labels = {
"mean_score_time": "CV Score time (s)",
"mean_test_score": "CV score (accuracy)",
}
fig = px.scatter(
cv_results,
x="mean_score_time",
y="mean_test_score",
error_x="std_score_time",
error_y="std_test_score",
hover_data=param_names,
labels=labels,
)
fig.update_layout(
title={
"text": "trade-off between scoring time and mean test score",
"y": 0.95,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
}
)
fig
import math
column_results = param_names + ["mean_test_score", "mean_score_time"]
transform_funcs = dict.fromkeys(column_results, lambda x: x)
transform_funcs["ngram_range"] = lambda x: x[1]
fig = px.parallel_coordinates(
cv_results[column_results].apply(transform_funcs),
color="mean_test_score",
color_continuous_scale=px.colors.sequential.Viridis_r,
labels=labels,
)
fig.update_layout(
title={
"text": "Parallel coordinates plot of text classifier pipeline",
"y": 0.99,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
}
)
fig
Logistic Regression Model is more robust.
import matplotlib.pyplot as plt
# Classifier names
classifiers = ['Naive Bayes', 'SVM', 'Logistic Regression']
# Corresponding accuracy scores
accuracies = [bayes_accuracy, svm_accuracy, logic_accuracy]
# Creating the enhanced bar plot
plt.figure(figsize=(10, 7))
bars = plt.bar(classifiers, accuracies, color=['skyblue', 'lightgreen', 'salmon'])
# Adding data labels above each bar
for bar in bars:
yval = bar.get_height()
plt.text(bar.get_x() + bar.get_width()/2, yval + 0.005, round(yval, 3), ha='center', va='bottom')
plt.xlabel('Classifier')
plt.ylabel('Accuracy Score')
plt.title('Comparison of Text Classifier Accuracies')
plt.ylim(0, 1.05) # Extend y-axis limit slightly above 1 for better visibility of data labels
plt.yticks([i/10 for i in range(11)]) # Setting y-axis ticks to improve readability
plt.grid(axis='y', linestyle='--', alpha=0.7) # Adding horizontal gridlines for easier comparison
plt.show()